# Replication file for: Panem et Circenses: Removing Political News to Generate Electoral Support, Evidence from Berlusconi’s Italy
source(here::here("src", "00_init.R"))

# Bring in the TV transcripts dataset (all topics).
# The code further down expects an object called `dat`, presumably restored
# from this file -- confirm against the script that wrote it.
load(here::here("data", "processed", "dat_transcripts_alltopics.Rdata"))

# Bring in the stopword list produced by an earlier step
load(here::here("data", "processed", "stopw.file.Rdata"))
# Re-encode the stopwords so they are UTF-8
stopw <- iconv(x = stopw, to = "UTF-8")


# Scaling ALL TOPICS --------------------------------------------------------------------

# Every news story is used (no topic selection); tabulate the macro-topics
table(dat$MacroArgomento)

# Collapse the story-level transcripts into a single text per edition,
# joining the stories of one edition with a space:
dat.edition <- aggregate(x = dat$text,
                         by = list(dat$doc_name_long),
                         FUN = paste,
                         collapse = " ")
colnames(dat.edition) <- c("doc_id", "text")

# Metadata columns taken from the story-level data, keeping one row per
# distinct combination of values:
dat.tmp <- unique(dat[, c("Titolo", "Edizione", "date", "canale", "doc_name_long"),
                      drop = FALSE])

# Attach the metadata to the pasted edition-level text
# (edition id: `doc_id` on the left, `doc_name_long` on the right):
dat.edition <- dplyr::left_join(dat.edition,
                                dat.tmp,
                                by = c("doc_id" = "doc_name_long"))
# The helper table is no longer needed
rm(dat.tmp)


library(tm)
# From: https://cran.r-project.org/web/packages/tm/vignettes/extensions.pdf
# readTabular no longer needed in newest tm version:
# m <- list(content = "text",   # has to be "content"
#           Titolo="Titolo", Edizione="Edizione",date="date",
#           canale="canale", id="id")
# myReader <- readTabular(mapping = m)

# Build the tm corpus directly from the edition-level data frame, whose first
# two columns were named "doc_id" and "text" above.
corpus.tm <- VCorpus(DataframeSource(dat.edition))

# Avoid using some quanteda functions while tm does the pre-processing.
# Detach only when quanteda is actually attached: an unconditional detach()
# stops the script with an error if the package is not on the search path
# (e.g. when this section is re-run after a previous detach).
if ("package:quanteda" %in% search()) {
  detach("package:quanteda", character.only = TRUE)
}

# Text pre-processing:
#       1. Removing numbers
#       2. Removing stopwords
#       3. Stemming
# NOTE(review): no lowercasing step runs before removeWords in this block;
# if the transcripts are not already lower-case, capitalised stopwords may
# survive this step -- confirm against the input data.
corpus.tm <- tm_map(corpus.tm, removeNumbers)
corpus.tm <- tm_map(corpus.tm, removeWords, stopw)
corpus.tm <- tm_map(corpus.tm, stemDocument, language = "italian")


# Creating the dfm
library("quanteda")
corpus <- quanteda::corpus(corpus.tm)
dfm <- dfm(corpus)
# Dense matrix copy of the dfm (documents in rows, words in columns)
dfm.mat <- as.matrix(dfm)

### Filter out words:
# Total count of each word across all documents. This has to be computed
# here: the thresholds below depend on it and nothing earlier in the script
# creates it.
WordsUsage <- colSums(dfm.mat)
Bottom <- quantile(WordsUsage, probs = 0.05)
Top <- quantile(WordsUsage, probs = 0.95)
WordsToRemove <- which(WordsUsage < Bottom | WordsUsage > Top)
# Guard the negative index: x[, -integer(0)] selects *no* columns, so an
# empty removal set would otherwise wipe the whole matrix.
if (length(WordsToRemove) > 0) {
  dfm.mat <- dfm.mat[, -WordsToRemove, drop = FALSE]
}

### Filter out documents:
# delete documents with < 20 words
WordsInDocs <- rowSums(dfm.mat)
# A logical mask is safe when no document falls below the threshold, and
# drop = FALSE keeps the result a matrix even if a single row remains.
dfm.mat <- dfm.mat[WordsInDocs >= 20, , drop = FALSE]

# NOTE(review): unlike the other outputs of this script, this file is written
# to the current working directory rather than under data/processed.
save(dfm.mat, file="dfm.mat.alltopics.Rdata")


# # Aggregation of news transcripts by day --------------------------------------------------
# library(plyr)
# 
# # Pasted (daily) dataset:
# dat.paste <- aggregate(dat$Text, 
#                        list(dat$id),
#                        paste,
#                        collapse=" ")
# names(dat.paste) <- c("id", "text")


# Corpus preprocessing -----------------------------------------
# (English-based pre-processing packages will fail, e.g.: "l'amico" -> "lamico")
# Following creates smaller sample in case needed:
# dat.paste <- dat.paste[sample(1:nrow(dat.paste),200),]
# All the code was run on a multicore cluster
# 
# 
# library(tm)
# # Split to lists to vectorize:
# dat.paste.ls <- split(dat.paste, dat.paste$id)
# TVcorpus <- VCorpus(VectorSource(dat.paste$text))
# 
# for (i in 1:length(TVcorpus)){
#         # Assign id tag
#         DublinCore(TVcorpus[[i]], tag = "id") <- as.character(unlist(dat.paste.ls[[i]][1]))
# }
# 
# TVcorpus <- tm_map(TVcorpus, stripWhitespace)
# TVcorpus <- tm_map(TVcorpus, content_transformer(tolower))
# TVcorpus <- tm_map(TVcorpus, removeWords, stopw) # run this on a small sample or on cluster
# TVcorpus <- tm_map(TVcorpus, removePunctuation)
# TVcorpus <- tm_map(TVcorpus, removeNumbers)
# TVcorpus <- tm_map(TVcorpus, stemDocument, language="italian")
# 
# save(TVcorpus,
#      file=file.path(here::here(), "data", "processed","TVcorpus.Rdata"))


# Creating the data feature matrix ---------------------------------------------
library(tm)
# Restore the pre-processed corpus saved by the corpus pre-processing step
load(here::here("data", "processed", "TVcorpus.Rdata"))

# Term-document matrix (wordfish wants the words in the rownames), with
# sparse terms removed at the 0.95 threshold and converted to a plain matrix.
# On removeSparseTerms, see:
# https://stackoverflow.com/questions/28763389/how-does-the-removesparseterms-in-r-work
dfm.tm <- as.matrix(
  removeSparseTerms(TermDocumentMatrix(TVcorpus), 0.95)
)


# Remove bottom and top 5% of words by frequency
# (rows of dfm.tm are words, columns are documents)
WordsUsage <- rowSums(dfm.tm)
# NOTE(review): the lower cutoff was previously computed with probs = 0.5
# (the median), which discards half of the vocabulary and contradicts the
# stated 5% rule and the `Bottom5` name. It is set to 0.05 here; confirm
# against the published results before relying on either value.
Bottom5 <- quantile(WordsUsage, probs = 0.05)
Top5 <- quantile(WordsUsage, probs = 0.95)
WordsToRemove <- which(WordsUsage < Bottom5 | WordsUsage > Top5)
# Guard the negative index: x[-integer(0), ] selects *no* rows, so an empty
# removal set would otherwise empty the matrix.
if (length(WordsToRemove) > 0) {
  dfm.tm <- dfm.tm[-WordsToRemove, , drop = FALSE]
}

RemoveCols <- which(colSums(dfm.tm) < 20)     # Remove documents with less than 20 words
colnames(dfm.tm)[RemoveCols]                  # documents about to be dropped
# Same guard as above, for the case where every document has >= 20 words
if (length(RemoveCols) > 0) {
  dfm.tm <- dfm.tm[, -RemoveCols, drop = FALSE]
}
dim(dfm.tm)
rm(Bottom5,RemoveCols,Top5,WordsToRemove,WordsUsage)
save(dfm.tm, 
     file=file.path(here::here(), "data", "processed","dfm.tm.RData"))


# Flip the matrix so that documents are in the rows and words in the columns,
# then convert it to a quanteda dfm and store it on disk
dfm.tm <- t(dfm.tm)
dfm <- as.dfm(dfm.tm)
save(list = "dfm",
     file = here::here("data", "processed", "dfm.RData"))



# Running wordfish text scaling algorithm -------------------------------------

# Wordfish on all topics --------
library(quanteda)
# Reload the dfm written by the previous section
load(here::here("data", "processed", "dfm.RData"))

# Fit the wordfish scaling model on the dfm and store the fitted object
results <- textmodel_wordfish(dfm)
save(list = "results",
     file = here::here("output", "results_wordfish.RData"))

